library(latexpdf)
library(stringr)
library(ggplot2)
library(gridExtra)
library(faraway)
library(MASS)
library(olsrr)
library(Metrics)
library(caret)
library(corrplot)


Data Description

About the Data

First, we will read the white wine data and look at the fundamentals of the data.

white.wine = read.csv("winequality-white.csv", header=TRUE, sep=";")
head(white.wine)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.0             0.27        0.36           20.7     0.045
## 2           6.3             0.30        0.34            1.6     0.049
## 3           8.1             0.28        0.40            6.9     0.050
## 4           7.2             0.23        0.32            8.5     0.058
## 5           7.2             0.23        0.32            8.5     0.058
## 6           8.1             0.28        0.40            6.9     0.050
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  45                  170  1.0010 3.00      0.45     8.8
## 2                  14                  132  0.9940 3.30      0.49     9.5
## 3                  30                   97  0.9951 3.26      0.44    10.1
## 4                  47                  186  0.9956 3.19      0.40     9.9
## 5                  47                  186  0.9956 3.19      0.40     9.9
## 6                  30                   97  0.9951 3.26      0.44    10.1
##   quality
## 1       6
## 2       6
## 3       6
## 4       6
## 5       6
## 6       6
names(white.wine)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"
dim(white.wine)
## [1] 4898   12
sapply(white.wine, class)
##        fixed.acidity     volatile.acidity          citric.acid 
##            "numeric"            "numeric"            "numeric" 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##            "numeric"            "numeric"            "numeric" 
## total.sulfur.dioxide              density                   pH 
##            "numeric"            "numeric"            "numeric" 
##            sulphates              alcohol              quality 
##            "numeric"            "numeric"            "integer"
white.info = summary(white.wine)
white.info
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0        Min.   :0.9871  
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0        1st Qu.:0.9917  
##  Median :0.04300   Median : 34.00      Median :134.0        Median :0.9937  
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4        Mean   :0.9940  
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0        3rd Qu.:0.9961  
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.180   Median :0.4700   Median :10.40   Median :6.000  
##  Mean   :3.188   Mean   :0.4898   Mean   :10.51   Mean   :5.878  
##  3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40   3rd Qu.:6.000  
##  Max.   :3.820   Max.   :1.0800   Max.   :14.20   Max.   :9.000


Include some basic descriptions including the class and the descriptive statistics of the variables. + Missing value


Data Visualization - Histograms

First, we will generate histograms for the all the variables. This will tell us the approximate distribution of each variables.

white.hist_y = ggplot(aes(quality), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 1) +
  ggtitle('Histogram of Quality') + theme(plot.title = element_text(size=9))

white.hist_x1 = ggplot(aes(fixed.acidity), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.5) +
  ggtitle('Histogram of Fixed Acidity') + theme(plot.title = element_text(size=8))

white.hist_x2 = ggplot(aes(volatile.acidity), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.1) +
  ggtitle('Histogram of Volatile Acidity') + theme(plot.title = element_text(size=7.5))

white.hist_x3 = ggplot(aes(citric.acid), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.1) +
  ggtitle('Histogram of Citric Acid') + theme(plot.title = element_text(size=8))

white.hist_x4 = ggplot(aes(residual.sugar), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 5) +
  ggtitle('Histogram of Residual Sugar') + theme(plot.title = element_text(size=6.7))

white.hist_x5 = ggplot(aes(chlorides), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.05) +
  ggtitle('Histogram of Chlorides') + theme(plot.title = element_text(size=8.5))

white.hist_x6 = ggplot(aes(free.sulfur.dioxide), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 10.4) +
  ggtitle('Histogram of Free Sulfur Dioxide') + theme(plot.title = element_text(size=6))

white.hist_x7 = ggplot(aes(total.sulfur.dioxide), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 16) +
  ggtitle('Histogram of Total Sulfur Dioxide') + theme(plot.title = element_text(size=6))

white.hist_x8 = ggplot(aes(density), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.004) +
  ggtitle('Histogram of Density') + theme(plot.title = element_text(size=9))

white.hist_x9 = ggplot(aes(pH), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.08) +
  ggtitle('Histogram of pH') + theme(plot.title = element_text(size=9))

white.hist_x10 = ggplot(aes(sulphates), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.1) +
  ggtitle('Histogram of Sulphates') + theme(plot.title = element_text(size=8))

white.hist_x11 = ggplot(aes(alcohol), data = white.wine) +
  geom_histogram(aes(color=I('black'), fill=I('beige')), binwidth = 0.3) +
  ggtitle('Histogram of Alcohol') + theme(plot.title = element_text(size=9))

grid.arrange(white.hist_y, white.hist_x1, white.hist_x2, white.hist_x3, white.hist_x4, white.hist_x5, white.hist_x6, white.hist_x7, white.hist_x8, white.hist_x9, white.hist_x10, white.hist_x11, ncol = 4, top="Histograms for White Wine Data")


There are a couple of things that we can see from the histograms above.


Data Visualization - Regressors vs. Predictor

Now, we will plot each of the regressors against the predictor. This will help us see the approximate relationship between each varibles and wine quality.

white.group_x1 = ggplot(aes(fixed.acidity, quality), data = white.wine) + 
  ggtitle("Fixed Acidity vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))

white.group_x2 = ggplot(aes(volatile.acidity, quality), data = white.wine) + 
  ggtitle("Volatile Acidity vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=8.5))

white.group_x3 = ggplot(aes(citric.acid, quality), data = white.wine) + 
  ggtitle("Citric Acid vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))

white.group_x4 = ggplot(aes(residual.sugar, quality), data = white.wine) + 
  ggtitle("Residual Sugar vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=8))

white.group_x5 = ggplot(aes(chlorides, quality), data = white.wine) + 
  ggtitle("Chlorides vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))

white.group_x6 = ggplot(aes(free.sulfur.dioxide, quality), data = white.wine) + 
  ggtitle("Free Sulfur Dioxide vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=7))

white.group_x7 = ggplot(aes(total.sulfur.dioxide, quality), data = white.wine) + 
  ggtitle("Total Sulfur Dioxide vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=6.5))

white.group_x8 = ggplot(aes(density, quality), data = white.wine) + 
  ggtitle("Density vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))

white.group_x9 = ggplot(aes(pH, quality), data = white.wine) + 
  ggtitle("pH vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") +  
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))

white.group_x10 = ggplot(aes(sulphates, quality), data = white.wine) + 
  ggtitle("Sulphates vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))

white.group_x11 = ggplot(aes(alcohol, quality), data = white.wine) + 
  ggtitle("Alcohol vs Quality") +
  geom_jitter(width = 0.25, alpha = 0.1, colour = "firebrick3") + 
  geom_smooth(method = "lm", se = FALSE, colour = "black") + theme(plot.title = element_text(size=9))
grid.arrange(white.hist_y, white.group_x1, white.group_x2, white.group_x3, white.group_x4, white.group_x5, white.group_x6, white.group_x7, white.group_x8, white.group_x9, white.group_x10, white.group_x11, ncol = 4)


There are a couple of things that we can see from the plots above.


Data Visualization - Boxplots

Now, we will generate boxplots of each variable and see more closely of the distribution of variables.

par(mfrow = c(3,4))
boxplot(white.wine$quality, col="slategray2", pch=19, xlab = "quality")
boxplot(white.wine$fixed.acidity, col="slategray2", pch=19, xlab = "Fixed Acidity")
boxplot(white.wine$volatile.acidity, col="slategray2", pch=19, xlab = "Volatile Acidity")
boxplot(white.wine$citric.acid, col="slategray2", pch=19, xlab = "Citric Acid")
boxplot(white.wine$residual.sugar, col="slategray2", pch=19, xlab = "Residual Sugar")
boxplot(white.wine$chlorides, col="slategray2", pch=19, xlab = "Chlorides")
boxplot(white.wine$free.sulfur.dioxide, col="slategray2", pch=19, xlab = "Free Sulfur Dioxide")
boxplot(white.wine$total.sulfur.dioxide, col="slategray2", pch=19, xlab = "Total Sulfur Dioxide")
boxplot(white.wine$density, col="slategray2", pch=19, xlab = "Density")
boxplot(white.wine$pH, col="slategray2", pch=19, xlab = "pH")
boxplot(white.wine$sulphates, col="slategray2", pch=19, xlab = "Sulphates")
boxplot(white.wine$alcohol, col="slategray2", pch=19, xlab = "Alcohol")


There are a couple of things that we can see from the boxplots above.


New Additional Data Point

Before we begin the data analysis, we will introduce the median point as a new additional data point to the white wine data.

white.datapoints = vector(mode = 'numeric', length = 12)
fixed.cen = (7.30 - mean(white.wine$fixed.acidity))
vol.cen = (0.32 - mean(white.wine$volatile.acidity))
citric.cen = (0.39 - mean(white.wine$citric.acid))
resid.cen = (9.9 - mean(white.wine$residual.sugar))
chlor.cen = (0.05 - mean(white.wine$chlorides))
free.cen = (46 - mean(white.wine$free.sulfur.dioxide))
tot.cen = (167 - mean(white.wine$total.sulfur.dioxide))
dens.cen = (0.9961 - mean(white.wine$density))
ph.cen = (3.28 - mean(white.wine$pH))
sul.cen = (0.55 - mean(white.wine$sulphates))
alc.cen = (11.40 - mean(white.wine$alcohol))
white.datapoints = c(fixed.cen, vol.cen, citric.cen, resid.cen, chlor.cen, free.cen, tot.cen, dens.cen, ph.cen, sul.cen, alc.cen, 6)
white.datapoints
##  [1]  0.445212332  0.041758881  0.055808493  3.508585137  0.004227644
##  [6] 10.691915067 28.639342589  0.002072624  0.091733361  0.060153124
## [11]  0.885732952  6.000000000
white.wine = rbind(white.wine, white.datapoints)
summary(white.wine)
##  fixed.acidity     volatile.acidity   citric.acid     residual.sugar  
##  Min.   : 0.4452   Min.   :0.04176   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.3000   1st Qu.:0.21000   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.8000   Median :0.26000   Median :0.3200   Median : 5.200  
##  Mean   : 6.8535   Mean   :0.27819   Mean   :0.3341   Mean   : 6.391  
##  3rd Qu.: 7.3000   3rd Qu.:0.32000   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.2000   Max.   :1.10000   Max.   :1.6600   Max.   :65.800  
##    chlorides        free.sulfur.dioxide total.sulfur.dioxide    density        
##  Min.   :0.004228   Min.   :  2.0       Min.   :  9.0        Min.   :0.002073  
##  1st Qu.:0.036000   1st Qu.: 23.0       1st Qu.:108.0        1st Qu.:0.991720  
##  Median :0.043000   Median : 34.0       Median :134.0        Median :0.993740  
##  Mean   :0.045764   Mean   : 35.3       Mean   :138.3        Mean   :0.993825  
##  3rd Qu.:0.050000   3rd Qu.: 46.0       3rd Qu.:167.0        3rd Qu.:0.996100  
##  Max.   :0.346000   Max.   :289.0       Max.   :440.0        Max.   :1.038980  
##        pH            sulphates          alcohol           quality     
##  Min.   :0.09173   Min.   :0.06015   Min.   : 0.8857   Min.   :3.000  
##  1st Qu.:3.09000   1st Qu.:0.41000   1st Qu.: 9.5000   1st Qu.:5.000  
##  Median :3.18000   Median :0.47000   Median :10.4000   Median :6.000  
##  Mean   :3.18764   Mean   :0.48976   Mean   :10.5123   Mean   :5.878  
##  3rd Qu.:3.28000   3rd Qu.:0.55000   3rd Qu.:11.4000   3rd Qu.:6.000  
##  Max.   :3.82000   Max.   :1.08000   Max.   :14.2000   Max.   :9.000
dim(white.wine)
## [1] 4899   12


We now introduced the median point as a new point to the white wine quality data.


Methods


Multiple Linear Regression - Full Model

Pairs Plot

First, with white wine, conduct a pairs plot to see if there are any particular patterns.

pairs(white.wine, main = "Pairs Plot of White Wine")

The pairs plot tells us a bit about the relationship between variables in the dataset. Specifically, we can see that a linear model seems appropriate, although some of the variables seem to have less of a linear relationship (we can look into that by conducting hypothesis tests). Also, the multicollinearity between the variables seem less than the red wine data, since some of the plots show a random scatter. Now, we will conduct a linear regression against the quality of the white wine.


Fitting Into a Linear Model

white.mdl = lm(quality~., data=white.wine)
white.sum = summary(white.mdl)


There are a couple of things that we notice from the summary of the linear model.

  • Fixed acidity, citiric acid, residual sugar, free sulfur dioxide, pH, sulphates, and alcohol have a positive relationship with white wine quality.
  • Volatile acidity, chlorides, total sulfur dioxide, density, have a negative relationship with white wine quality.
  • The P-value of the overall regression is significantly small, which suggests that there is a linear association between at least one of the regressors and the white wine quality.
  • The adjusted R-squared value is 0.2802 (28.02%), which implies that our linear model fit to the data is not at all satisfactory.
  • Looking at the individual P-values, citric acid, chlorides, and total sulfur dioxide seem to be insignificant given that all the other variables are in the model.


Anova Test

In order to see if some of the regressors are insignificant to the regression, we will first run the anova test.

anova(white.mdl)
## Analysis of Variance Table
## 
## Response: quality
##                        Df  Sum Sq Mean Sq   F value    Pr(>F)    
## fixed.acidity           1   49.23   49.23   86.1807 < 2.2e-16 ***
## volatile.acidity        1  148.85  148.85  260.5770 < 2.2e-16 ***
## citric.acid             1    0.08    0.08    0.1477    0.7008    
## residual.sugar          1   21.47   21.47   37.5888 9.421e-10 ***
## chlorides               1  137.76  137.76  241.1641 < 2.2e-16 ***
## free.sulfur.dioxide     1    2.01    2.01    3.5142    0.0609 .  
## total.sulfur.dioxide    1   72.93   72.93  127.6766 < 2.2e-16 ***
## density                 1    0.44    0.44    0.7631    0.3824    
## pH                      1   12.40   12.40   21.7057 3.263e-06 ***
## sulphates               1   14.11   14.11   24.7023 6.919e-07 ***
## alcohol                 1  590.08  590.08 1032.9822 < 2.2e-16 ***
## Residuals            4887 2791.64    0.57                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


The anova test conducts a partial F-test, and supports our hypothesis that citric acid seem to be insiginificant to the linear model given that the variables before them are already included in the model.

To check which of the variables must be selected in building the model, we will conduct a variable selection later on.


Multicollinearity

In the pairs plot above, we suspected that there may be near linear relationship between some of the regressor variables in the data. We will conduct a diagnostics since multicollinearity will cause a serious problem that may dramatically impact the usefulness of the linear model. We will use the variance inflation factor (VIF) to check.

vif(white.mdl)
##        fixed.acidity     volatile.acidity          citric.acid 
##             1.403105             1.129854             1.161188 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##             1.492696             1.206257             1.744774 
## total.sulfur.dioxide              density                   pH 
##             2.156138             1.259156             1.481981 
##            sulphates              alcohol 
##             1.059733             1.638633


Our textbook suggests that VIF larger than 10 will cause serious problems with multicollinearity. Here, VIF for residual sugar and density are larger than 10 and therefore we have a problem. We will look more into this multicollinearity later on.


Model Assessment

Residual Analysis

We will first go over the graphic analysis of the residuals to check if the error are i.i.d. normally distributed, with zero mean and constant variance.


Residual vs. Fitted
plot(white.mdl, which=1)


Comments on the residual vs. fitted plot


Normal Q-Q
plot(white.mdl, which=2)


We observe that the residuals for our model meet the normality assumption. There seems to be a slight negative skew to the distribution of the residuals. Also, there are some potential influential points. We will look at this in a moment.


Scale-Location
plot(white.mdl, which=3)


Comments on the Scale-Location Plot


Residuals vs. Leverage
plot(white.mdl, which=5)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced


Comments on the Residuals vs. Leverage Plot


Overall Residual Plots
par(mfrow = c(2,2))
plot(white.mdl, which=1)
plot(white.mdl, which=2)
plot(white.mdl, which=3)
plot(white.mdl, which=5)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced


Outlier Detection

For the outlier detection, we are going to use externally studentized residuals and plot them against various values.

white.ti = rstudent(white.mdl)
plot(white.mdl$fitted.values, white.ti, xlab = "Fitted Values", ylab="Externally Studentized Residuals", main="Studentized Residuals vs. Fitted Values")


Identifying the points on the plot, points 4899, 4676 and 448 are potential outliers. To investigate the influence of these points on the model, we will obtain an equation with these two observations deleted.


white.wine.del = white.wine[-c(4899,4676,448), ]
white.mdl.del = lm(quality ~ . , data=white.wine.del)
white.sum = summary(white.mdl)
white.sum.del = summary(white.mdl.del)
white.mse = mean(white.sum$residuals^2)
white.mse.del = mean(white.sum.del$residuals^2)
white.sum
## 
## Call:
## lm(formula = quality ~ ., data = white.wine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9108 -0.4952 -0.0340  0.4665  3.1740 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.8639971  0.7562491   7.754 1.08e-14 ***
## fixed.acidity        -0.0461155  0.0150720  -3.060  0.00223 ** 
## volatile.acidity     -1.9549206  0.1138344 -17.173  < 2e-16 ***
## citric.acid          -0.0274293  0.0961178  -0.285  0.77537    
## residual.sugar        0.0271816  0.0026015  10.448  < 2e-16 ***
## chlorides            -0.9167898  0.5427403  -1.689  0.09125 .  
## free.sulfur.dioxide   0.0047483  0.0008387   5.662 1.58e-08 ***
## total.sulfur.dioxide -0.0008550  0.0003729  -2.293  0.02191 *  
## density              -3.9026551  0.8366436  -4.665 3.17e-06 ***
## pH                    0.1882683  0.0835603   2.253  0.02430 *  
## sulphates             0.4247037  0.0972815   4.366 1.29e-05 ***
## alcohol               0.3588470  0.0111651  32.140  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7558 on 4887 degrees of freedom
## Multiple R-squared:  0.2732, Adjusted R-squared:  0.2716 
## F-statistic:   167 on 11 and 4887 DF,  p-value: < 2.2e-16
white.sum.del
## 
## Call:
## lm(formula = quality ~ ., data = white.wine.del)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8365 -0.4935 -0.0380  0.4640  3.1148 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.501e+02  1.881e+01   7.980 1.81e-15 ***
## fixed.acidity         6.534e-02  2.088e-02   3.129  0.00177 ** 
## volatile.acidity     -1.864e+00  1.138e-01 -16.378  < 2e-16 ***
## citric.acid           2.247e-02  9.579e-02   0.235  0.81457    
## residual.sugar        8.141e-02  7.530e-03  10.810  < 2e-16 ***
## chlorides            -2.489e-01  5.467e-01  -0.455  0.64888    
## free.sulfur.dioxide   3.737e-03  8.445e-04   4.425 9.84e-06 ***
## total.sulfur.dioxide -2.832e-04  3.782e-04  -0.749  0.45403    
## density              -1.502e+02  1.908e+01  -7.871 4.29e-15 ***
## pH                    6.848e-01  1.054e-01   6.494 9.17e-11 ***
## sulphates             6.314e-01  1.004e-01   6.288 3.50e-10 ***
## alcohol               1.936e-01  2.423e-02   7.991 1.65e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7515 on 4884 degrees of freedom
## Multiple R-squared:  0.2819, Adjusted R-squared:  0.2803 
## F-statistic: 174.3 on 11 and 4884 DF,  p-value: < 2.2e-16
white.mse
## [1] 0.5698386
white.mse.del
## [1] 0.5633514


Deleting the potential outlier points has a lot of effect on the regression coefficients. In other words, points 2762 and 4746 had a huge impact on the slope and the intercept of the model. In fact, deleting the points increased the \(R^2\) value from 0.2802 to 0.2862, although it is a slight increase. Also, the mean squared error of the model decreases as well. Hence, we conclude that points 2762 and 4746 are influential.

We will use this adjusted model from now on.


Multiple Linear Regression - Edited Model

However, the improved model still has a low value of \(R^2\). Based on the information we acquired from both the residuals vs. fitted values, we may say that the deficiency of the model is due to trying to fit a discrete quality value to a continuous data.


Outlier Diagnostics : Cook’s Distance

Cook’s distance is one of the ways to measure a point’s influence by considering both the location of a point in the x space and the response variable.

Points with large values of \(D_i\) can be interpreted as an influential point.

white.cooksd = cooks.distance(white.mdl)
white.max.di = max(white.cooksd)
white.max.di
## [1] 3037.942
plot(white.cooksd, pch="*", cex=2, ylab="Cook's Distance", main="Influential Observations by Cook's Distance") 
abline(h = 10*mean(white.cooksd, na.rm=T), col="red")  # adding the cutoff line
text(x=1:length(white.cooksd)+1, y=white.cooksd, labels=ifelse(white.cooksd>10*mean(white.cooksd, na.rm=T),names(white.cooksd),""), col="red")


When interpreting Cook’s distance, we categorize the values as follows:

  • \(D_i\) > 0.5 : The \(i_{th}\) data point is worthy of further investigation as influential
  • \(D_i\) > 1 : The \(i_{th}\) data point is quite likely to be influential
  • If \(D_i\) sticks out from the other values, it is most likely to be influential

Even though the maximum value for the Cook’s distance is 0.02267929, which is way smaller than 0.5, but since it is still larger than other \(D_i\) values, we will take a closer look.

We will fit the model by deleting point 3308.

white.wine.inf = white.wine[-4899, ]
white.mdl.inf = lm(quality ~ . , data=white.wine.inf)
white.sum.inf = summary(white.mdl.inf)
white.mse.inf = mean(white.sum.inf$residuals^2)
white.sum
## 
## Call:
## lm(formula = quality ~ ., data = white.wine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9108 -0.4952 -0.0340  0.4665  3.1740 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.8639971  0.7562491   7.754 1.08e-14 ***
## fixed.acidity        -0.0461155  0.0150720  -3.060  0.00223 ** 
## volatile.acidity     -1.9549206  0.1138344 -17.173  < 2e-16 ***
## citric.acid          -0.0274293  0.0961178  -0.285  0.77537    
## residual.sugar        0.0271816  0.0026015  10.448  < 2e-16 ***
## chlorides            -0.9167898  0.5427403  -1.689  0.09125 .  
## free.sulfur.dioxide   0.0047483  0.0008387   5.662 1.58e-08 ***
## total.sulfur.dioxide -0.0008550  0.0003729  -2.293  0.02191 *  
## density              -3.9026551  0.8366436  -4.665 3.17e-06 ***
## pH                    0.1882683  0.0835603   2.253  0.02430 *  
## sulphates             0.4247037  0.0972815   4.366 1.29e-05 ***
## alcohol               0.3588470  0.0111651  32.140  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7558 on 4887 degrees of freedom
## Multiple R-squared:  0.2732, Adjusted R-squared:  0.2716 
## F-statistic:   167 on 11 and 4887 DF,  p-value: < 2.2e-16
white.sum.inf
## 
## Call:
## lm(formula = quality ~ ., data = white.wine.inf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8348 -0.4934 -0.0379  0.4637  3.1143 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.502e+02  1.880e+01   7.987 1.71e-15 ***
## fixed.acidity         6.552e-02  2.087e-02   3.139  0.00171 ** 
## volatile.acidity     -1.863e+00  1.138e-01 -16.373  < 2e-16 ***
## citric.acid           2.209e-02  9.577e-02   0.231  0.81759    
## residual.sugar        8.148e-02  7.527e-03  10.825  < 2e-16 ***
## chlorides            -2.473e-01  5.465e-01  -0.452  0.65097    
## free.sulfur.dioxide   3.733e-03  8.441e-04   4.422 9.99e-06 ***
## total.sulfur.dioxide -2.857e-04  3.781e-04  -0.756  0.44979    
## density              -1.503e+02  1.907e+01  -7.879 4.04e-15 ***
## pH                    6.863e-01  1.054e-01   6.513 8.10e-11 ***
## sulphates             6.315e-01  1.004e-01   6.291 3.44e-10 ***
## alcohol               1.935e-01  2.422e-02   7.988 1.70e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared:  0.2819, Adjusted R-squared:  0.2803 
## F-statistic: 174.3 on 11 and 4886 DF,  p-value: < 2.2e-16
white.mse
## [1] 0.5698386
white.mse.inf
## [1] 0.5631541


Deleting the point increases the \(R^2\) and the mean squared error value by just a little bit, which is not as desirable. Hence, we conclude that the point 3308 is not influential.

As mentioned above, this may be due to trying to fit a discrete quality value to a continuous data.


Transformation

Previously, the summary of the full model has told us that our model is not doing a good job in fitting the data. And while analyzing the residuals, we realized the fact that fitting a discrete data into a continuous model may be causing such problem.

Before we go into transforamtion on ordinal variables, we will try some conventional transformations.

Log Transformation

First, let’s try a log transformation to see if it will improve our regression.

white.log = lm(log(quality) ~ ., data=white.wine.inf)
white.log.sum = summary(white.log)
white.sum.inf
## 
## Call:
## lm(formula = quality ~ ., data = white.wine.inf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8348 -0.4934 -0.0379  0.4637  3.1143 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.502e+02  1.880e+01   7.987 1.71e-15 ***
## fixed.acidity         6.552e-02  2.087e-02   3.139  0.00171 ** 
## volatile.acidity     -1.863e+00  1.138e-01 -16.373  < 2e-16 ***
## citric.acid           2.209e-02  9.577e-02   0.231  0.81759    
## residual.sugar        8.148e-02  7.527e-03  10.825  < 2e-16 ***
## chlorides            -2.473e-01  5.465e-01  -0.452  0.65097    
## free.sulfur.dioxide   3.733e-03  8.441e-04   4.422 9.99e-06 ***
## total.sulfur.dioxide -2.857e-04  3.781e-04  -0.756  0.44979    
## density              -1.503e+02  1.907e+01  -7.879 4.04e-15 ***
## pH                    6.863e-01  1.054e-01   6.513 8.10e-11 ***
## sulphates             6.315e-01  1.004e-01   6.291 3.44e-10 ***
## alcohol               1.935e-01  2.422e-02   7.988 1.70e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared:  0.2819, Adjusted R-squared:  0.2803 
## F-statistic: 174.3 on 11 and 4886 DF,  p-value: < 2.2e-16
white.log.sum
## 
## Call:
## lm(formula = log(quality) ~ ., data = white.wine.inf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81782 -0.08049  0.00273  0.08392  0.51448 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.653e+01  3.281e+00   8.084 7.81e-16 ***
## fixed.acidity         9.151e-03  3.642e-03   2.512    0.012 *  
## volatile.acidity     -3.491e-01  1.986e-02 -17.579  < 2e-16 ***
## citric.acid           7.227e-03  1.671e-02   0.432    0.665    
## residual.sugar        1.412e-02  1.313e-03  10.752  < 2e-16 ***
## chlorides            -4.601e-02  9.537e-02  -0.482    0.630    
## free.sulfur.dioxide   5.862e-04  1.473e-04   3.980 7.00e-05 ***
## total.sulfur.dioxide -2.687e-05  6.597e-05  -0.407    0.684    
## density              -2.574e+01  3.328e+00  -7.734 1.26e-14 ***
## pH                    1.105e-01  1.839e-02   6.009 2.00e-09 ***
## sulphates             1.102e-01  1.752e-02   6.294 3.37e-10 ***
## alcohol               3.251e-02  4.226e-03   7.692 1.74e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1311 on 4886 degrees of freedom
## Multiple R-squared:  0.2759, Adjusted R-squared:  0.2743 
## F-statistic: 169.2 on 11 and 4886 DF,  p-value: < 2.2e-16


By taking logarthims of quality values, estimates of each of the coefficients changed quite a lot. However, the log transformation decreased the \(R^2\) value, which is not good.

Since our goal was to improve the model to better fit the data, we will skip further analysis on the log transformation model and move on.


Square-Root Transformation

Next, we will do a square-root transformation to see if it will improve our regression.

white.sqr = lm(sqrt(quality) ~ ., data=white.wine.inf)
white.sqr.sum = summary(white.sqr)
white.sum.inf
## 
## Call:
## lm(formula = quality ~ ., data = white.wine.inf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8348 -0.4934 -0.0379  0.4637  3.1143 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.502e+02  1.880e+01   7.987 1.71e-15 ***
## fixed.acidity         6.552e-02  2.087e-02   3.139  0.00171 ** 
## volatile.acidity     -1.863e+00  1.138e-01 -16.373  < 2e-16 ***
## citric.acid           2.209e-02  9.577e-02   0.231  0.81759    
## residual.sugar        8.148e-02  7.527e-03  10.825  < 2e-16 ***
## chlorides            -2.473e-01  5.465e-01  -0.452  0.65097    
## free.sulfur.dioxide   3.733e-03  8.441e-04   4.422 9.99e-06 ***
## total.sulfur.dioxide -2.857e-04  3.781e-04  -0.756  0.44979    
## density              -1.503e+02  1.907e+01  -7.879 4.04e-15 ***
## pH                    6.863e-01  1.054e-01   6.513 8.10e-11 ***
## sulphates             6.315e-01  1.004e-01   6.291 3.44e-10 ***
## alcohol               1.935e-01  2.422e-02   7.988 1.70e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared:  0.2819, Adjusted R-squared:  0.2803 
## F-statistic: 174.3 on 11 and 4886 DF,  p-value: < 2.2e-16
white.sqr.sum
## 
## Call:
## lm(formula = sqrt(quality) ~ ., data = white.wine.inf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87914 -0.10005 -0.00190  0.09852  0.60553 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.220e+01  3.903e+00   8.250  < 2e-16 ***
## fixed.acidity         1.233e-02  4.333e-03   2.846  0.00445 ** 
## volatile.acidity     -4.023e-01  2.362e-02 -17.031  < 2e-16 ***
## citric.acid           6.448e-03  1.988e-02   0.324  0.74566    
## residual.sugar        1.690e-02  1.562e-03  10.817  < 2e-16 ***
## chlorides            -5.292e-02  1.134e-01  -0.467  0.64087    
## free.sulfur.dioxide   7.411e-04  1.752e-04   4.229 2.39e-05 ***
## total.sulfur.dioxide -4.640e-05  7.847e-05  -0.591  0.55436    
## density              -3.099e+01  3.959e+00  -7.827 6.08e-15 ***
## pH                    1.375e-01  2.187e-02   6.285 3.56e-10 ***
## sulphates             1.316e-01  2.084e-02   6.315 2.95e-10 ***
## alcohol               3.961e-02  5.027e-03   7.879 4.02e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.156 on 4886 degrees of freedom
## Multiple R-squared:  0.2803, Adjusted R-squared:  0.2787 
## F-statistic:   173 on 11 and 4886 DF,  p-value: < 2.2e-16


By taking the square-root of quality values, estimates of each of the coefficients changed quite a lot. However, the square-root transformation decreased the \(R^2\) value, which is not good.

Since our goal was to improve the model to better fit the data, we will skip further analysis on the square-root transformation model and move on.


Ordinal Variable Regression

Taking the discreteness of the quality values into consideration, we will now look into transforming ordinal variables.

#white.wine.del$quality = factor(white.wine.del$quality)
#white.tran.mdl = polr(quality ~ . , data=white.wine.del, Hess=TRUE)
#white.tran.sum = summary(white.tran.mdl)


Still figuring out how to do this part …


Variable Selection

As we saw in the beginning, some of the variables did not seem significant to our model. Since incorrect model specification can cause serious problems, we will try to find the most appropriate subset of regressors for our model.

We will first define all the possible candidates then compare their adjusted \(R^2\) values.


Best Subset Regression

First step in choosing the best set of regressors for our model, we will fit all the regression equations involving one, two regressors, and so on. Then we will select the subset of predictors that do the best at meeting some well-defined objective criterion, such as a large adjusted \(R^2\) value or the small MSE, Mallow’s \(C_p\), or AIC.

white.ols = ols_step_best_subset(white.mdl.inf)
white.ols
##                                                                 Best Subsets Regression                                                                 
## --------------------------------------------------------------------------------------------------------------------------------------------------------
## Model Index    Predictors
## --------------------------------------------------------------------------------------------------------------------------------------------------------
##      1         alcohol                                                                                                                                   
##      2         volatile.acidity alcohol                                                                                                                  
##      3         volatile.acidity residual.sugar alcohol                                                                                                   
##      4         volatile.acidity residual.sugar free.sulfur.dioxide alcohol                                                                               
##      5         volatile.acidity residual.sugar density pH alcohol                                                                                        
##      6         volatile.acidity residual.sugar density pH sulphates alcohol                                                                              
##      7         volatile.acidity residual.sugar free.sulfur.dioxide density pH sulphates alcohol                                                          
##      8         fixed.acidity volatile.acidity residual.sugar free.sulfur.dioxide density pH sulphates alcohol                                            
##      9         fixed.acidity volatile.acidity residual.sugar free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol                       
##     10         fixed.acidity volatile.acidity residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol             
##     11         fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol 
## --------------------------------------------------------------------------------------------------------------------------------------------------------
## 
##                                                        Subsets Regression Summary                                                        
## -----------------------------------------------------------------------------------------------------------------------------------------
##                        Adj.        Pred                                                                                                   
## Model    R-Square    R-Square    R-Square      C(p)         AIC           SBIC          SBC          MSEP        FPE       HSP      APC  
## -----------------------------------------------------------------------------------------------------------------------------------------
##   1        0.1897      0.1896      0.1891    618.9350    11684.7824    -2215.6105    11704.2722    3113.5281    0.6359    1e-04    0.8109 
##   2        0.2402      0.2399      0.2393    277.3040    11371.5516    -2528.6907    11397.5379    2920.0529    0.5965    1e-04    0.7607 
##   3        0.2585      0.2581      0.2573    154.8290    11254.1662    -2645.9902    11286.6491    2850.3214    0.5824    1e-04    0.7427 
##   4        0.2640      0.2634      0.2619    119.6254    11219.9116    -2680.2301    11258.8911    2829.8798    0.5784    1e-04    0.7375 
##   5        0.2711      0.2703      0.2689     73.4614    11174.5740    -2725.4969    11220.0500    2803.2347    0.5730    1e-04    0.7307 
##   6        0.2767      0.2758       0.274     37.4649    11138.9039    -2761.0848    11190.8766    2782.3268    0.5689    1e-04    0.7254 
##   7        0.2801      0.2791      0.2766     15.9119    11117.4069    -2782.5146    11175.8762    2769.5776    0.5664    1e-04    0.7222 
##   8        0.2818      0.2806      0.2757      6.8056    11108.2878    -2791.5929    11173.2536    2763.8627    0.5653    1e-04    0.7209 
##   9        0.2818      0.2805      0.2753      8.2383    11109.7192    -2790.1546    11181.1816    2764.1074    0.5655    1e-04    0.7211 
##  10        0.2819      0.2804      0.2751     10.0532    11111.5336    -2788.3345    11189.4926    2764.5684    0.5657    1e-04    0.7214 
##  11        0.2819      0.2803      0.2748     12.0000    11113.4803    -2786.3827    11197.9358    2765.1042    0.5659    1e-04    0.7217 
## -----------------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria 
##  SBIC: Sawa's Bayesian Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
##  MSEP: Estimated error of prediction, assuming multivariate normality 
##  FPE: Final Prediction Error 
##  HSP: Hocking's Sp 
##  APC: Amemiya Prediction Criteria
plot(white.ols)

best.white = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar + free.sulfur.dioxide +                      density + pH + sulphates + alcohol, data=white.wine.inf)
best.white.sum = summary(best.white)


Looking at the plots above, the increase in \(R^2\) and Adjusted \(R^2\) almost flattens by model 8. In fact, change in Mallow’s \(C_p\) statistic and AIC drastically decreases at model 8 as well.

Hence, best subset regression tells us that model 8, which has the following variables:

  • fixed acidity
  • volatile acidity
  • residual sugar
  • free sulfur dioxide
  • density
  • pH
  • sulphates
  • alcohol

They best meet our criterion, which perfectly aligns with our expectation we had when we saw the large P-values in the full model.


Stepwise Regression Methods

Since comparing all possible regressions and the best subsets take a lot of time and work, we will choose only a few models by adding or deleting regressors one at a time. Here, keep in mind that we used AIC values to compare each steps.


Forward Selection

Start with a model with no regressors included, then we will add regressors as we go on.

start.white.mdl = lm(quality~1, data=white.wine.inf)
scope.white.mdl = lm(quality~., data=white.wine.inf)
step(start.white.mdl, direction = "forward", scope=formula(scope.white.mdl))
## Start:  AIC=-1188.69
## quality ~ 1
## 
##                        Df Sum of Sq    RSS     AIC
## + alcohol               1    728.73 3112.3 -2217.1
## + density               1    362.30 3478.7 -1672.0
## + chlorides             1    169.28 3671.7 -1407.5
## + volatile.acidity      1    145.64 3695.4 -1376.0
## + total.sulfur.dioxide  1    117.28 3723.7 -1338.6
## + fixed.acidity         1     49.62 3791.4 -1250.4
## + pH                    1     37.97 3803.0 -1235.3
## + residual.sugar        1     36.57 3804.4 -1233.5
## + sulphates             1     11.07 3829.9 -1200.8
## <none>                              3841.0 -1188.7
## + citric.acid           1      0.33 3840.7 -1187.1
## + free.sulfur.dioxide   1      0.26 3840.7 -1187.0
## 
## Step:  AIC=-2217.14
## quality ~ alcohol
## 
##                        Df Sum of Sq    RSS     AIC
## + volatile.acidity      1   193.992 2918.3 -2530.4
## + free.sulfur.dioxide   1    56.181 3056.1 -2304.4
## + residual.sugar        1    46.959 3065.3 -2289.6
## + fixed.acidity         1    14.509 3097.8 -2238.0
## + sulphates             1    14.424 3097.8 -2237.9
## + chlorides             1    12.419 3099.8 -2234.7
## + density               1    10.484 3101.8 -2231.7
## + pH                    1     8.442 3103.8 -2228.4
## + citric.acid           1     2.184 3110.1 -2218.6
## + total.sulfur.dioxide  1     2.079 3110.2 -2218.4
## <none>                              3112.3 -2217.1
## 
## Step:  AIC=-2530.37
## quality ~ alcohol + volatile.acidity
## 
##                        Df Sum of Sq    RSS     AIC
## + residual.sugar        1    70.271 2848.0 -2647.8
## + free.sulfur.dioxide   1    40.484 2877.8 -2596.8
## + density               1    25.639 2892.6 -2571.6
## + fixed.acidity         1    16.109 2902.2 -2555.5
## + total.sulfur.dioxide  1    11.165 2907.1 -2547.2
## + sulphates             1    11.006 2907.3 -2546.9
## + pH                    1     5.489 2912.8 -2537.6
## + chlorides             1     4.473 2913.8 -2535.9
## <none>                              2918.3 -2530.4
## + citric.acid           1     0.301 2918.0 -2528.9
## 
## Step:  AIC=-2647.76
## quality ~ alcohol + volatile.acidity + residual.sugar
## 
##                        Df Sum of Sq    RSS     AIC
## + free.sulfur.dioxide   1   21.0028 2827.0 -2682.0
## + density               1   20.8069 2827.2 -2681.7
## + fixed.acidity         1   19.0015 2829.0 -2678.5
## + pH                    1   13.5270 2834.5 -2669.1
## + sulphates             1   13.0677 2834.9 -2668.3
## + total.sulfur.dioxide  1    1.9014 2846.1 -2649.0
## + chlorides             1    1.6387 2846.3 -2648.6
## + citric.acid           1    1.5879 2846.4 -2648.5
## <none>                              2848.0 -2647.8
## 
## Step:  AIC=-2682.01
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide
## 
##                        Df Sum of Sq    RSS     AIC
## + density               1   19.0411 2807.9 -2713.1
## + fixed.acidity         1   15.4963 2811.5 -2706.9
## + pH                    1   11.4714 2815.5 -2699.9
## + sulphates             1   11.0877 2815.9 -2699.3
## + total.sulfur.dioxide  1    2.4096 2824.6 -2684.2
## + chlorides             1    2.2177 2824.8 -2683.8
## + citric.acid           1    2.2107 2824.8 -2683.8
## <none>                              2827.0 -2682.0
## 
## Step:  AIC=-2713.11
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide + 
##     density
## 
##                        Df Sum of Sq    RSS     AIC
## + pH                    1   23.9315 2784.0 -2753.0
## + sulphates             1   22.2233 2785.7 -2750.0
## + fixed.acidity         1    4.1638 2803.8 -2718.4
## + chlorides             1    1.4137 2806.5 -2713.6
## <none>                              2807.9 -2713.1
## + citric.acid           1    0.4310 2807.5 -2711.9
## + total.sulfur.dioxide  1    0.0843 2807.9 -2711.3
## 
## Step:  AIC=-2753.04
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide + 
##     density + pH
## 
##                        Df Sum of Sq    RSS     AIC
## + sulphates             1   18.9648 2765.1 -2784.5
## + fixed.acidity         1    2.9318 2781.1 -2756.2
## <none>                              2784.0 -2753.0
## + chlorides             1    0.5578 2783.5 -2752.0
## + citric.acid           1    0.2232 2783.8 -2751.4
## + total.sulfur.dioxide  1    0.0992 2783.9 -2751.2
## 
## Step:  AIC=-2784.51
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide + 
##     density + pH + sulphates
## 
##                        Df Sum of Sq    RSS     AIC
## + fixed.acidity         1    6.2700 2758.8 -2793.6
## <none>                              2765.1 -2784.5
## + chlorides             1    0.5311 2764.5 -2783.5
## + total.sulfur.dioxide  1    0.4328 2764.6 -2783.3
## + citric.acid           1    0.1415 2764.9 -2782.8
## 
## Step:  AIC=-2793.63
## quality ~ alcohol + volatile.acidity + residual.sugar + free.sulfur.dioxide + 
##     density + pH + sulphates + fixed.acidity
## 
##                        Df Sum of Sq    RSS     AIC
## <none>                              2758.8 -2793.6
## + total.sulfur.dioxide  1   0.32024 2758.5 -2792.2
## + chlorides             1   0.10967 2758.7 -2791.8
## + citric.acid           1   0.01298 2758.8 -2791.7
## 
## Call:
## lm(formula = quality ~ alcohol + volatile.acidity + residual.sugar + 
##     free.sulfur.dioxide + density + pH + sulphates + fixed.acidity, 
##     data = white.wine.inf)
## 
## Coefficients:
##         (Intercept)              alcohol     volatile.acidity  
##           1.541e+02            1.932e-01           -1.888e+00  
##      residual.sugar  free.sulfur.dioxide              density  
##           8.285e-02            3.349e-03           -1.543e+02  
##                  pH            sulphates        fixed.acidity  
##           6.942e-01            6.285e-01            6.810e-02
forward.white = lm(quality ~ alcohol + volatile.acidity + residual.sugar + 
    free.sulfur.dioxide + density + pH + sulphates + fixed.acidity, 
    data = white.wine.inf)
forward.white.sum = summary(forward.white)


Our final model has 8 of the 11 possible predictors (written in the order they were added):

  • alcohol
  • volatile acidity
  • residual sugar
  • free sulfur dioxide
  • density
  • pH
  • sulphates
  • fixed acidity

Coincidentally, the chosen 8 variables are identical as our choice for best subsets regression.


Backward Elimination

Start with a model with all the regressors included, then we will eliminate the insignificant regressors as we go on.

start.white.mdl2 = lm(quality~., data=white.wine.inf)
step(start.white.mdl2, direction = "backward")
## Start:  AIC=-2788.44
## quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + 
##     chlorides + free.sulfur.dioxide + total.sulfur.dioxide + 
##     density + pH + sulphates + alcohol
## 
##                        Df Sum of Sq    RSS     AIC
## - citric.acid           1     0.030 2758.4 -2790.4
## - chlorides             1     0.116 2758.4 -2790.2
## - total.sulfur.dioxide  1     0.323 2758.7 -2789.9
## <none>                              2758.3 -2788.4
## - fixed.acidity         1     5.562 2763.9 -2780.6
## - free.sulfur.dioxide   1    11.039 2769.4 -2770.9
## - sulphates             1    22.339 2780.7 -2750.9
## - pH                    1    23.948 2782.3 -2748.1
## - density               1    35.044 2793.4 -2728.6
## - alcohol               1    36.020 2794.3 -2726.9
## - residual.sugar        1    66.152 2824.5 -2674.4
## - volatile.acidity      1   151.345 2909.7 -2528.8
## 
## Step:  AIC=-2790.39
## quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
##     chlorides + free.sulfur.dioxide + total.sulfur.dioxide + 
##     density + pH + sulphates + alcohol
## 
##                        Df Sum of Sq    RSS     AIC
## - chlorides             1     0.105 2758.5 -2792.2
## - total.sulfur.dioxide  1     0.315 2758.7 -2791.8
## <none>                              2758.4 -2790.4
## - fixed.acidity         1     5.749 2764.1 -2782.2
## - free.sulfur.dioxide   1    11.096 2769.4 -2772.7
## - sulphates             1    22.444 2780.8 -2752.7
## - pH                    1    23.971 2782.3 -2750.0
## - density               1    35.066 2793.4 -2730.5
## - alcohol               1    36.540 2794.9 -2727.9
## - residual.sugar        1    66.160 2824.5 -2676.3
## - volatile.acidity      1   156.805 2915.2 -2521.6
## 
## Step:  AIC=-2792.2
## quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
##     free.sulfur.dioxide + total.sulfur.dioxide + density + pH + 
##     sulphates + alcohol
## 
##                        Df Sum of Sq    RSS     AIC
## - total.sulfur.dioxide  1     0.320 2758.8 -2793.6
## <none>                              2758.5 -2792.2
## - fixed.acidity         1     6.157 2764.6 -2783.3
## - free.sulfur.dioxide   1    11.036 2769.5 -2774.7
## - sulphates             1    22.570 2781.0 -2754.3
## - pH                    1    25.297 2783.8 -2749.5
## - alcohol               1    36.536 2795.0 -2729.8
## - density               1    36.823 2795.3 -2729.2
## - residual.sugar        1    70.134 2828.6 -2671.2
## - volatile.acidity      1   158.543 2917.0 -2520.5
## 
## Step:  AIC=-2793.63
## quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
##     free.sulfur.dioxide + density + pH + sulphates + alcohol
## 
##                       Df Sum of Sq    RSS     AIC
## <none>                             2758.8 -2793.6
## - fixed.acidity        1     6.270 2765.1 -2784.5
## - free.sulfur.dioxide  1    13.826 2772.6 -2771.2
## - sulphates            1    22.303 2781.1 -2756.2
## - pH                   1    25.460 2784.2 -2750.6
## - alcohol              1    36.300 2795.1 -2731.6
## - density              1    39.920 2798.7 -2725.3
## - residual.sugar       1    72.942 2831.7 -2667.8
## - volatile.acidity     1   167.753 2926.5 -2506.5
## 
## Call:
## lm(formula = quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
##     free.sulfur.dioxide + density + pH + sulphates + alcohol, 
##     data = white.wine.inf)
## 
## Coefficients:
##         (Intercept)        fixed.acidity     volatile.acidity  
##           1.541e+02            6.810e-02           -1.888e+00  
##      residual.sugar  free.sulfur.dioxide              density  
##           8.285e-02            3.349e-03           -1.543e+02  
##                  pH            sulphates              alcohol  
##           6.942e-01            6.285e-01            1.932e-01
backward.white = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
    free.sulfur.dioxide + density + pH + sulphates + alcohol, 
    data = white.wine.inf)
backward.white.sum = summary(backward.white)


Our final model has 8 of the 11 possible predictors (written in their order in the model):

  • fixed acidity
  • volatile acidity
  • residual sugar
  • free sulfur dioxide
  • density
  • pH
  • sulphates
  • alcohol

Again, the chosen 8 variables are identical as our choice for best subsets regression and forward selection.


Stepwise Regression

Repeat forward selection and backward elimination.

step(white.mdl.inf, direction= "both")
## Start:  AIC=-2788.44
## quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + 
##     chlorides + free.sulfur.dioxide + total.sulfur.dioxide + 
##     density + pH + sulphates + alcohol
## 
##                        Df Sum of Sq    RSS     AIC
## - citric.acid           1     0.030 2758.4 -2790.4
## - chlorides             1     0.116 2758.4 -2790.2
## - total.sulfur.dioxide  1     0.323 2758.7 -2789.9
## <none>                              2758.3 -2788.4
## - fixed.acidity         1     5.562 2763.9 -2780.6
## - free.sulfur.dioxide   1    11.039 2769.4 -2770.9
## - sulphates             1    22.339 2780.7 -2750.9
## - pH                    1    23.948 2782.3 -2748.1
## - density               1    35.044 2793.4 -2728.6
## - alcohol               1    36.020 2794.3 -2726.9
## - residual.sugar        1    66.152 2824.5 -2674.4
## - volatile.acidity      1   151.345 2909.7 -2528.8
## 
## Step:  AIC=-2790.39
## quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
##     chlorides + free.sulfur.dioxide + total.sulfur.dioxide + 
##     density + pH + sulphates + alcohol
## 
##                        Df Sum of Sq    RSS     AIC
## - chlorides             1     0.105 2758.5 -2792.2
## - total.sulfur.dioxide  1     0.315 2758.7 -2791.8
## <none>                              2758.4 -2790.4
## + citric.acid           1     0.030 2758.3 -2788.4
## - fixed.acidity         1     5.749 2764.1 -2782.2
## - free.sulfur.dioxide   1    11.096 2769.4 -2772.7
## - sulphates             1    22.444 2780.8 -2752.7
## - pH                    1    23.971 2782.3 -2750.0
## - density               1    35.066 2793.4 -2730.5
## - alcohol               1    36.540 2794.9 -2727.9
## - residual.sugar        1    66.160 2824.5 -2676.3
## - volatile.acidity      1   156.805 2915.2 -2521.6
## 
## Step:  AIC=-2792.2
## quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
##     free.sulfur.dioxide + total.sulfur.dioxide + density + pH + 
##     sulphates + alcohol
## 
##                        Df Sum of Sq    RSS     AIC
## - total.sulfur.dioxide  1     0.320 2758.8 -2793.6
## <none>                              2758.5 -2792.2
## + chlorides             1     0.105 2758.4 -2790.4
## + citric.acid           1     0.019 2758.4 -2790.2
## - fixed.acidity         1     6.157 2764.6 -2783.3
## - free.sulfur.dioxide   1    11.036 2769.5 -2774.7
## - sulphates             1    22.570 2781.0 -2754.3
## - pH                    1    25.297 2783.8 -2749.5
## - alcohol               1    36.536 2795.0 -2729.8
## - density               1    36.823 2795.3 -2729.2
## - residual.sugar        1    70.134 2828.6 -2671.2
## - volatile.acidity      1   158.543 2917.0 -2520.5
## 
## Step:  AIC=-2793.63
## quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
##     free.sulfur.dioxide + density + pH + sulphates + alcohol
## 
##                        Df Sum of Sq    RSS     AIC
## <none>                              2758.8 -2793.6
## + total.sulfur.dioxide  1     0.320 2758.5 -2792.2
## + chlorides             1     0.110 2758.7 -2791.8
## + citric.acid           1     0.013 2758.8 -2791.7
## - fixed.acidity         1     6.270 2765.1 -2784.5
## - free.sulfur.dioxide   1    13.826 2772.6 -2771.2
## - sulphates             1    22.303 2781.1 -2756.2
## - pH                    1    25.460 2784.2 -2750.6
## - alcohol               1    36.300 2795.1 -2731.6
## - density               1    39.920 2798.7 -2725.3
## - residual.sugar        1    72.942 2831.7 -2667.8
## - volatile.acidity      1   167.753 2926.5 -2506.5
## 
## Call:
## lm(formula = quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
##     free.sulfur.dioxide + density + pH + sulphates + alcohol, 
##     data = white.wine.inf)
## 
## Coefficients:
##         (Intercept)        fixed.acidity     volatile.acidity  
##           1.541e+02            6.810e-02           -1.888e+00  
##      residual.sugar  free.sulfur.dioxide              density  
##           8.285e-02            3.349e-03           -1.543e+02  
##                  pH            sulphates              alcohol  
##           6.942e-01            6.285e-01            1.932e-01
both.white = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
    free.sulfur.dioxide + density + pH + sulphates + alcohol, 
    data = white.wine.inf)
both.white.sum = summary(both.white)


Our final model has 8 of the 11 possible predictors (written in their order in the model):

  • fixed acidity
  • volatile acidity
  • residual sugar
  • free sulfur dioxide
  • density
  • pH
  • sulphates
  • alcohol

As it turns out, the chosen 8 variables are identical for all our variable selection methods.


Now, we will compare all our models by their adjusted \(R^2\).

# Adjusted R-squared for the full model
white.rsq = white.sum.inf$adj.r.squared
white.rsq
## [1] 0.2802536
# Adjusted R-squared for best subset regression model
best.white.rsq = best.white.sum$adj.r.squared
best.white.rsq
## [1] 0.2805767
# Adjusted R-squared for forward selection model
forward.white.rsq = forward.white.sum$adj.r.squared
forward.white.rsq
## [1] 0.2805767
# Adjusted R-squared for backward elimination model
backward.white.rsq = backward.white.sum$adj.r.squared
backward.white.rsq
## [1] 0.2805767
# Adjusted R-squared for stepwise regression model
both.white.rsq = both.white.sum$adj.r.squared
both.white.rsq
## [1] 0.2805767


Clearly, even after selecting the appropriate regressors, the adjusted \(R^2\) did not improve. In other words, the subset models do not show much of a higher performance. In fact, as we saw above, all the variable selection models choose the same 8 variables and even generates equal adjusted \(R^2\) values.

Hence, from now on, the best model used will be

Quality = \(\beta_0\) + \(\beta_1\) * fixed acidity + \(\beta_2\) * volatile acidity + \(\beta_3\) * residual sugar + \(\beta_4\) * free sulfur dioxide + \(\beta_5\) * density + \(\beta_6\) * pH + \(\beta_7\) * sulphates + \(\beta_8\) * alcohol.

However, although these are all the variable selection method we learned, we should always keep in mind that there will be several equally good models. That is, we might be ignorant of the background knowledge of the collected data, i.e. there may be a whole other variable that affects wine quality.

With that in mind, we will assess our final model by investigating the residual plots.


Residual Analysis

fin.white = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
    free.sulfur.dioxide + density + pH + sulphates + alcohol, 
    data = white.wine.inf)


Residual vs. Fitted
plot(fin.white, which=1)


Comments on the residual vs. fitted plot


Normal Q-Q
plot(fin.white, which=2)


We observe that the residuals for our model meet the normality assumption. The negative skew that we saw in the full model has reduced, and we cannot really spot potential influential points, which is a good thing.


Scale-Location
plot(fin.white, which=3)


Comments on the Scale-Location Plot


Residuals vs. Leverage
par(mfrow = c(2,2))
plot(fin.white, which=1)
plot(fin.white, which=2)
plot(fin.white, which=3)
plot(fin.white, which=5)


Comments on the Residuals vs. Leverage Plot


Overall Residual Plots
par(mfrow = c(2,2))
plot(fin.white, which=1)
plot(fin.white, which=2)
plot(fin.white, which=3)
plot(fin.white, which=5)


white.cooksd2 = cooks.distance(fin.white)
white.max.di2 = max(white.cooksd2)
white.max.di2
## [1] 1.296286
plot(white.cooksd2, pch="*", cex=2, ylab="Cook's Distance", main="Influential Observations by Cook's Distance for White Wine") 
abline(h = 10*mean(white.cooksd2, na.rm=T), col="red")  # adding the cutoff line
text(x=1:length(white.cooksd2)+1, y=white.cooksd2, labels=ifelse(white.cooksd2>10*mean(white.cooksd2, na.rm=T),names(white.cooksd2),""), col="red")

## Model Validation

Before we jump to any conclusions, we should check the validity of our model. There is a good chance that this data was made so that one can predict the quality of a wine based on specific chemical attributes.

We will perform cross validation in order to see how well our model predicts the quality of white wine. We do this by dividing the dataset in such a way that 80 percent of the dataset is part of the training set and 20 percent of the dataset is the testing set.

Then we will compare the actual value to the predicted value. We repeat the steps of validation 5 times to check the values of R squared, root mean square error and mean absolute error, which will tell us how the model behaves.


Cross Validation - Full Model

First, we will see the prediction power of our full model with all our regressors.

set.seed(71168)
sample.white.n = ceiling(0.8*length(white.wine.inf$quality))

for(i in 1:5){
  train.white.sample = sample(c(1:length(white.wine.inf$quality)),sample.white.n)
  train.white.sample = sort(train.white.sample)
  
  train.white.data = white.wine.inf[train.white.sample,]
  test.white.data = white.wine.inf[-train.white.sample,]
  
  train.white.mdl = lm(train.white.data$quality~., data = train.white.data)
  summary(train.white.mdl) 
  
  preds.white = predict(train.white.mdl,test.white.data)
  
  plot(test.white.data$quality,preds.white,xlim=c(4,10),ylim=c(4,10))
  abline(c(0,1))
  
  # R-squared
  R.sq = R2(preds.white, test.white.data$quality)
  
  #RMSPE
  RMSPE = RMSE(preds.white, test.white.data$quality)
  
  #MAPE
  MAPE = MAE(preds.white, test.white.data$quality)
  
  print(c(i,R.sq,RMSPE,MAPE))
}

## [1] 1.0000000 0.2780152 0.7504382 0.5866302

## [1] 2.0000000 0.2273238 0.7712567 0.6016954

## [1] 3.0000000 0.3076548 0.7370168 0.5790178

## [1] 4.0000000 0.2858487 0.7760121 0.5936838

## [1] 5.0000000 0.2730520 0.7367243 0.5665545


Here are the things that we notice from our numerical output.

  • \(R^2\) value remains to be small. An ideal model explains at least 80% of the data, while only 31.0% of our model could be explained, which is pretty bad.
  • Root mean square error (or MSPE), the difference between the predicted quality and test quality, is pretty high. Our goal is to have as small a difference as possible, but in our case, it goes up to 0.79.
  • Mean square error (or MSE) is also pretty high, which is not good.


Now, here are the things that we see from our graphical observations.

  • When predicting quality 4, the model tends to overestimate the value.
  • When predicting quality 7 and 8, the model tends to underestimate the value.
  • Some of the observations are predictd to have quality 9, even though it is a bit underestimated.
  • Our model does predict quality 5 and 6, the huge variability around the value implies that many points are further away from the centre. Quality 5, especially, seems to be frequently overestimated.


Cross Validation - Final Model

Next, let’s take a look at the prediction power of our final model with selected regressors.

set.seed(71168)
sample.white.n = ceiling(0.8*length(white.wine.inf$quality))

par(mfrow = c(3,3))
for(i in 1:5){
  train.white.sample = sample(c(1:length(white.wine.inf$quality)),sample.white.n)
  train.white.sample = sort(train.white.sample)
  
  train.white.data = white.wine.inf[train.white.sample,]
  test.white.data = white.wine.inf[-train.white.sample,]
  
  train.white.fin = lm(quality ~ fixed.acidity + volatile.acidity + residual.sugar + 
    free.sulfur.dioxide + density + pH + sulphates + alcohol, data = train.white.data)
  summary(train.white.fin) 
  
  preds.white.fin = predict(train.white.fin,test.white.data)
  
  plot(test.white.data$quality,preds.white.fin,xlim=c(4,10),ylim=c(4,10))
  abline(c(0,1))
  
  # R-squared
  R.sq = R2(preds.white.fin, test.white.data$quality)
  
  #RMSPE
  RMSPE = RMSE(preds.white.fin, test.white.data$quality)
  
  #MAPE
  MAPE = MAE(preds.white.fin, test.white.data$quality)
  
  print(c(i,R.sq,RMSPE,MAPE))
}
## [1] 1.0000000 0.2786311 0.7501360 0.5865241
## [1] 2.0000000 0.2284019 0.7706200 0.6016314
## [1] 3.0000000 0.3102651 0.7356908 0.5786415
## [1] 4.0000000 0.2855969 0.7761478 0.5938715
## [1] 5.0000000 0.2738739 0.7362855 0.5667910


Both the numerical and graphical observations do not have a lot of difference from our full model.

Hence we conclude that both our models have no prediction powers.


Conclusion

This is where conclusion will take place.


References

Here are the works cited.

Montgomery, D. C., Peck, E. A., & Vining, G. G. (2012). Introduction to Linear Regression Analysis - 5th ed. Hoboken, New Jersey: John Wiley & Sons, Inc.

P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.